{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this code, we concatenate the duration data. Only used to complete some company names missing (mistake of the data).\n",
    "\n",
    "The last historical is 'CA_historical_through_may_2022.csv', it ecompases all the previous ones."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "#import string as str\n",
    "import numpy as np\n",
    "#import plotly.express as px\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.dates as mdates\n",
    "import os\n",
    "import re\n",
    "import sys\n",
    "import polars as pl\n",
    "from datetime import datetime\n",
    "from dateutil.relativedelta import relativedelta\n",
    "sys.path.append(\"../\")\n",
    "import tools\n",
    "\n",
    "pd.set_option('display.max_rows',12)\n",
    "pd.set_option('display.max_columns',12)\n",
    "pd.options.display.float_format = '{:,.5f}'.format\n",
    "pd.get_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CHANGE: Declaring data paths\n",
    "path_indeed = '' # raw data \n",
    "path_data = '' # analysis data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the list\n",
    "duration_files = [\n",
    "    'CA_duration_20230224.csv',\n",
    "    'CA_duration_feb_22.csv',\n",
    "    'CA_duration_20230210.csv',\n",
    "    'CA_Batch_Update_20221125_with_replacements.csv', # This is different from PY's folder in indeed_naics (he uses the one that does not say \"with replacement\")\n",
    "    'CA_duration_20220607.csv',\n",
    "    'CA_historical_through_may_2022.csv'\n",
    "    ]\n",
    "\n",
    "new_filenames = [\n",
    "    name for name in os.listdir(path_indeed)\n",
    "    if re.match(r'CA_duration_\\d{8}.csv', name)\n",
    "    and re.search(r'\\d{8}', name)[0] > '20220505'  # no need for files before historical may 2022\n",
    "    and name not in duration_files\n",
    "    ]\n",
    "\n",
    "duration_files.extend(new_filenames)    \n",
    "duration_files.sort(\n",
    "    key=lambda filename: os.path.getmtime(path_indeed+filename), reverse=True)\n",
    "\n",
    "print(duration_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assemble the duration data\n",
    "# define the columns\n",
    "cols = ['job_key', 'company_name', 'job_title', 'first_date', 'last_date', 'duration_days', 'searchable_days', 'jlcity', 'jladmin1code', 'job_country']\n",
    "\n",
    "# define the functions\n",
    "def create_list_datasets(latest_file):  ## by default most recent file\n",
    "#def create_list_datasets(duration_files):\n",
    "    \n",
    "    index_latest_file = duration_files.index(latest_file)  \n",
    "    dfs = []\n",
    "    for i, file in enumerate(duration_files[index_latest_file:]):\n",
    "    #for i, file in enumerate(duration_files):\n",
    "\n",
    "        try:\n",
    "            file_path = os.path.join(path_indeed, file)\n",
    "            print(\"Reading:\", file_path)  # Print the file path\n",
    "            df = pl.read_csv(path_indeed + file, try_parse_dates=True, columns=cols)\n",
    "            if 'duration_days' in df.columns:\n",
    "                df = df.with_columns([\n",
    "                # replace 'NA' and other non-numeric strings with None\n",
    "                df['duration_days'].apply(lambda x: None if x in ['NA', 'na', ''] else x)\n",
    "                .cast(pl.Int64)\n",
    "                #.fill_none(None)\n",
    "                .alias('duration_days')\n",
    "                ])\n",
    "        except:\n",
    "            file_path = os.path.join(path_indeed, file)\n",
    "            print(\"Reading:\", file_path)  # Print the file path\n",
    "            df = pl.read_csv(path_indeed + file, try_parse_dates=True, columns=cols, ignore_errors=True)\n",
    "            if 'duration_days' in df.columns:\n",
    "                    df = df.with_columns([\n",
    "                    # replace 'NA' and other non-numeric strings with None\n",
    "                    df['duration_days'].apply(lambda x: None if x in ['NA', 'na', ''] else x)\n",
    "                    .cast(pl.Int64)\n",
    "                    #.fill_none(None)\n",
    "                    .alias('duration_days')\n",
    "                    ])\n",
    "        df = df.with_columns(pl.lit(i).alias('version'))\n",
    "\n",
    "        # quick exploration\n",
    "        print(file)\n",
    "        print(df.shape)\n",
    "        print(df.columns)\n",
    "        print(df.dtypes)\n",
    "        print('Max last day = ', df['last_date'].max())\n",
    "        print('Max first day = ', df['first_date'].max())\n",
    "        print('Min last day = ', df['last_date'].min())\n",
    "        print('Min first day = ', df['first_date'].min())\n",
    "        max_last_date = df['last_date'].max() # see the censored observations\n",
    "        df = df.with_columns(\n",
    "            pl.when(pl.col('last_date') == max_last_date).then(True).otherwise(False).alias('censored')\n",
    "        )\n",
    "        print('Censored')\n",
    "        print(df['censored'].value_counts(sort=True).select(\n",
    "            pl.col('censored'),\n",
    "            pl.col('counts'),\n",
    "            pl.col('counts').apply(lambda x: x/df.shape[0]*100).round(2).alias('perc')\n",
    "            )\n",
    "        )\n",
    "        df = df.with_columns( # see whether company name is missing\n",
    "            pl.when((pl.col('company_name').is_null()) | (pl.col('company_name').is_in(['NA','']))).then(True).otherwise(False).alias('no_company')\n",
    "        )\n",
    "        print('No company = ')\n",
    "        print(df['no_company'].value_counts(sort=True).select(\n",
    "            pl.col('no_company'),\n",
    "            pl.col('counts'),\n",
    "            pl.col('counts').apply(lambda x: x/df.shape[0]*100).round(2).alias('perc')\n",
    "            )\n",
    "        )\n",
    "        print(df.head())\n",
    "   \n",
    "        dfs.append(df)\n",
    "    \n",
    "    return dfs\n",
    "\n",
    "def concat_clean_duration_datasets(dfs, filter_last_date):\n",
    "\n",
    "    canadian_provinces = [\n",
    "        'ab', 'bc', 'mb', 'nb', 'nl', 'ns', 'nt', 'nu', 'on', 'pe', 'qc', 'sk',\n",
    "        'yt', 'unknown']\n",
    "\n",
    "    df = (\n",
    "        pl.concat(dfs, how='vertical')\n",
    "        .filter(\n",
    "            (pl.col('job_country') == 'CA') & \n",
    "            pl.col('jladmin1code').is_in(canadian_provinces) &\n",
    "            pl.col('company_name').is_not_null() &\n",
    "            ~pl.col('company_name').is_in(['NA', ''])\n",
    "        )\n",
    "        .drop('job_country')\n",
    "        .sort('version')\n",
    "        .groupby('job_key')\n",
    "        .agg(\n",
    "            [\n",
    "                pl.max('company_name'), # picks non-null company (if there is one)\n",
    "                pl.min('first_date'), \n",
    "                pl.max('last_date'),\n",
    "                pl.max('jlcity'),\n",
    "                pl.first('jladmin1code'), # picks the most recent (as files are in descending order)\n",
    "                pl.max('job_title'), # picks non-null job_title\n",
    "                pl.max('duration_days'),\n",
    "                pl.max('searchable_days')\n",
    "            ]\n",
    "        )\n",
    "        #.with_columns(pl.lit(1).alias('vacancies')) # use this when merging to the inflows data\n",
    "    )\n",
    "\n",
    "    if filter_last_date:\n",
    "        df = df.filter(pl.col('last_date') < pl.col('last_date').max())\n",
    "\n",
    "    return df\n",
    "\n",
    "def assemble_duration_data(last_file=duration_files[0], filter_last_date=False):\n",
    "    dfs = create_list_datasets(last_file)\n",
    "    df_duration = concat_clean_duration_datasets(dfs, filter_last_date)\n",
    "    return df_duration\n",
    "\n",
    "# other functions are defined in tools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# construct data\n",
    "\n",
    "df = assemble_duration_data()\n",
    "\n",
    "# this data cleaning it's not working so far\n",
    "df = tools.clean_city_name(df)\n",
    "df = tools.clean_company_name(df)\n",
    "\n",
    "df = (\n",
    "    df\n",
    "    .with_columns(\n",
    "        [\n",
    "            pl.when(pl.col('jlcity').is_null() | (pl.col('jlcity') == ''))\n",
    "            .then('unknown')\n",
    "            .otherwise(pl.col('jlcity'))\n",
    "            .alias('jlcity')\n",
    "        ]\n",
    "    )\n",
    "    # .filter(pl.col('company_name') != '')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# saving \n",
    "df.write_parquet(path_data+'dfdur.parquet')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.11 ('env_indeed2')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "7120590dfa35e6512fb14e5e70b67446c3e78c7a5c027e908dbb14d6a3f8a0eb"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
